# Load required libraries
library(readxl)
## Warning: package 'readxl' was built under R version 4.3.2
library(forecast)
## Warning: package 'forecast' was built under R version 4.3.2
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library(car)
## Warning: package 'car' was built under R version 4.3.2
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.3.2
library(rgl)
## Warning: package 'rgl' was built under R version 4.3.2
library(plotly)
## Warning: package 'plotly' was built under R version 4.3.2
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.3.2
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# Import data from Excel file
data <- read_excel("C:/Users/tzp3890/OneDrive - Western Michigan University/Project 2/GS1M.xlsx")
data2 <- read_excel("C:/Users/tzp3890/OneDrive - Western Michigan University/Project 2/DGS10.xlsx")
# Convert observation_date column to Date format
data$observation_date <- as.Date(data$observation_date)
data2$observation_date <- as.Date(data2$observation_date)
# Extract the numeric series from each dataset
GS1M_values <- data$GS1M
DGS10_values <- data2$DGS10
### ACF for GS1M
acf(GS1M_values)
### PACF for GS1M
pacf(GS1M_values)
### ACF for DGS10
acf(DGS10_values)
### PACF for DGS10
pacf(DGS10_values)
### PACF of the first difference of GS1M (autocorrelation remaining after differencing)
pacf(diff(GS1M_values))
Looking at their respective ACFs, both series are highly autocorrelated at every lag up to at least lag 15; however, once the effect of the shorter lags is removed by looking at their PACFs, the deeper lags are insignificant. GS1M shares a slightly stronger relationship with its deeper lags than DGS10 does. The slowly decaying ACF combined with the sharp PACF cutoff could suggest non-stationarity and motivate using the first or second difference in the model, but for now I have decided to use the original values.
# Fit Moving Average (MA) Model to each series
ma_model2 <- Arima(DGS10_values, order=c(0, 0, 1))
ma_model <- Arima(GS1M_values, order=c(0, 0, 1))
# Order(p, d, q): p=0 (AR), d=0 (no differencing), q=1 (MA)
# Fit Autoregressive (AR) Model to each series
ar_model2 <- Arima(DGS10_values, order=c(1, 0, 0))
ar_model <- Arima(GS1M_values, order=c(1, 0, 0))
# Order(p, d, q): p=1 (AR), d=0 (no differencing), q=0 (MA)
# Summarizing MA Model (GS1M)
summary(ma_model)
## Series: GS1M_values
## ARIMA(0,0,1) with non-zero mean
##
## Coefficients:
## ma1 mean
## 0.9423 1.4231
## s.e. 0.0185 0.1054
##
## sigma^2 = 0.8067: log likelihood = -355.52
## AIC=717.03 AICc=717.12 BIC=727.84
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set -0.002211166 0.8948397 0.7103834 -Inf Inf 5.991988 0.8276147
The mean used in our model in Excel is close to the intercept shown in the model. Using the coefficient from Q4 was also a good guess: the value of approximately 0.99 is close to the optimized value of 0.9423. The AIC is extremely high, which suggests the model doesn't capture the data well, so it should be used with caution.
# Summarizing AR Model (GS1M)
summary(ar_model)
## Series: GS1M_values
## ARIMA(1,0,0) with non-zero mean
##
## Coefficients:
## ar1 mean
## 0.9947 3.279
## s.e. 0.0048 1.819
##
## sigma^2 = 0.05095: log likelihood = 17.56
## AIC=-29.12 AICc=-29.03 BIC=-18.31
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set -0.002941103 0.2248958 0.1202839 -Inf Inf 1.014578 0.2758153
I used a linear regression in Excel to estimate the coefficients, but as we discussed in class, that algorithm doesn't optimize the coefficients to minimize the error score, so our values are different. Our slope coefficients are fairly similar, but the intercepts are very different.
# Summarizing MA Model (DGS10)
# Note the large positive AIC (2334.3) relative to the AR(1) fit shown below (-3313.58)
summary(ma_model2)
## Series: DGS10_values
## ARIMA(0,0,1) with non-zero mean
##
## Coefficients:
## ma1 mean
## 0.9651 2.2689
## s.e. 0.0052 0.0362
##
## sigma^2 = 0.4086: log likelihood = -1164.15
## AIC=2334.3 AICc=2334.32 BIC=2349.57
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 0.0001189877 0.6387058 0.5620178 -21.73871 36.4544 12.45806
## ACF1
## Training set 0.9387602
Extremely high AIC. This shares the same pattern as the MA model for GS1M, as the coefficient and intercept are very similar.
# Summarizing AR Model (DGS10)
# AR(1) achieves a much lower AIC (-3313.58) and RMSE (0.0604) than the MA(1) fit above
summary(ar_model2)
## Series: DGS10_values
## ARIMA(1,0,0) with non-zero mean
##
## Coefficients:
## ar1 mean
## 0.9988 2.2942
## s.e. 0.0012 1.0316
##
## sigma^2 = 0.003653: log likelihood = 1659.79
## AIC=-3313.58 AICc=-3313.56 BIC=-3298.31
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 0.001611576 0.06039056 0.04518806 -0.07034281 2.457796 1.001669
## ACF1
## Training set 0.00994548
The MA(1) models can only produce one distinct forecast into the future (h = 1), since they use only one lagged error term; beyond that horizon the forecast simply reverts to the series mean.
# Forecasting for GS1M - MA Model
ma_forecast <- forecast(ma_model, h = 1) # Forecast one step ahead (h = 1)
# Summarize the forecast: model info, error measures, and prediction intervals
summary(ma_forecast)
##
## Forecast method: ARIMA(0,0,1) with non-zero mean
##
## Model Information:
## Series: GS1M_values
## ARIMA(0,0,1) with non-zero mean
##
## Coefficients:
## ma1 mean
## 0.9423 1.4231
## s.e. 0.0185 0.1054
##
## sigma^2 = 0.8067: log likelihood = -355.52
## AIC=717.03 AICc=717.12 BIC=727.84
##
## Error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set -0.002211166 0.8948397 0.7103834 -Inf Inf 5.991988 0.8276147
##
## Forecasts:
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## 272 4.04228 2.891241 5.193318 2.281918 5.802641
# Forecasting for GS1M - AR Model
ar_forecast <- forecast(ar_model, h = 10) # Forecast the next 10 steps
# Summarize the forecast: model info, error measures, and prediction intervals
summary(ar_forecast)
##
## Forecast method: ARIMA(1,0,0) with non-zero mean
##
## Model Information:
## Series: GS1M_values
## ARIMA(1,0,0) with non-zero mean
##
## Coefficients:
## ar1 mean
## 0.9947 3.279
## s.e. 0.0048 1.819
##
## sigma^2 = 0.05095: log likelihood = 17.56
## AIC=-29.12 AICc=-29.03 BIC=-18.31
##
## Error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set -0.002941103 0.2248958 0.1202839 -Inf Inf 1.014578 0.2758153
##
## Forecasts:
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## 272 5.527968 5.238683 5.817253 5.085545 5.970391
## 273 5.516000 5.107977 5.924024 4.891982 6.140019
## 274 5.504096 5.005697 6.002495 4.741860 6.266332
## 275 5.492255 4.918277 6.066234 4.614431 6.370079
## 276 5.480477 4.840446 6.120509 4.501634 6.459321
## 277 5.468762 4.769493 6.168031 4.399323 6.538202
## 278 5.457110 4.703802 6.210417 4.305025 6.609195
## 279 5.445519 4.642316 6.248722 4.217126 6.673912
## 280 5.433990 4.584300 6.283680 4.134501 6.733478
## 281 5.422522 4.529216 6.315828 4.056329 6.788715
# Forecasting for DGS10 - MA Model
ma_forecast2 <- forecast(ma_model2, h = 1) # Forecast one step ahead (h = 1)
# Summarize the forecast: model info, error measures, and prediction intervals
summary(ma_forecast2)
##
## Forecast method: ARIMA(0,0,1) with non-zero mean
##
## Model Information:
## Series: DGS10_values
## ARIMA(0,0,1) with non-zero mean
##
## Coefficients:
## ma1 mean
## 0.9651 2.2689
## s.e. 0.0052 0.0362
##
## sigma^2 = 0.4086: log likelihood = -1164.15
## AIC=2334.3 AICc=2334.32 BIC=2349.57
##
## Error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 0.0001189877 0.6387058 0.5620178 -21.73871 36.4544 12.45806
## ACF1
## Training set 0.9387602
##
## Forecasts:
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## 1199 3.093965 2.274746 3.913183 1.841078 4.346851
# Forecasting for DGS10 - AR Model
ar_forecast2 <- forecast(ar_model2, h = 10) # Forecast the next 10 steps
# Summarize the forecast: model info, error measures, and prediction intervals
summary(ar_forecast2)
##
## Forecast method: ARIMA(1,0,0) with non-zero mean
##
## Model Information:
## Series: DGS10_values
## ARIMA(1,0,0) with non-zero mean
##
## Coefficients:
## ar1 mean
## 0.9988 2.2942
## s.e. 0.0012 1.0316
##
## sigma^2 = 0.003653: log likelihood = 1659.79
## AIC=-3313.58 AICc=-3313.56 BIC=-3298.31
##
## Error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 0.001611576 0.06039056 0.04518806 -0.07034281 2.457796 1.001669
## ACF1
## Training set 0.00994548
##
## Forecasts:
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## 1199 4.067866 3.990407 4.145324 3.949403 4.186328
## 1200 4.065734 3.956257 4.175210 3.898303 4.233164
## 1201 4.063604 3.929604 4.197605 3.858668 4.268541
## 1202 4.061478 3.906840 4.216115 3.824980 4.297976
## 1203 4.059354 3.886567 4.232140 3.795100 4.323608
## 1204 4.057232 3.868067 4.246396 3.767930 4.346534
## 1205 4.055113 3.850914 4.259311 3.742818 4.367407
## 1206 4.052996 3.834830 4.271163 3.719339 4.386653
## 1207 4.050882 3.819620 4.282144 3.697198 4.404567
## 1208 4.048771 3.805145 4.292396 3.676178 4.421364
None of the models seem to forecast very well, as their forecasts converge very quickly.
Considering the models programmed in Q7, I would use the AR model for GS1M since it has a lower Root Mean Squared Error (RMSE). The MA model has a better ME score, but RMSE weighs larger errors more heavily, which I would like to avoid — hence my decision to use RMSE. For DGS10, I would also use an AR model for the same reason: a lower RMSE score.
Considering all models, I have a preference for the EMA(1) model for DGS10 because I find the mechanics behind it easy to understand, and the combination of a good accuracy score and an adjustable smoothing factor allows for more insight. I value the smoothing for DGS10 over GS1M because DGS10 increases more consistently, whereas the erratic peaks and inconsistent plateaus of GS1M don't work well when the smoothing factor is decreased.
I wasn't sure about GS1M, so I decided to try to make my AR model more accurate by changing the parameters. I switched to using the first difference to help make the series more stationary, and I used up to 15 lags, as inferred from the ACF plots. I also tested up to 20 lags, but the error score wasn't significantly affected. This model is the most accurate of all the models across the error scores and also has a low AIC, capturing most of the data. Changing the number of lags and the order of differencing also had a positive impact on the forecast, which looks much more reliable.
# Plot the raw GS1M series (levels)
plot(data$GS1M)
# Plot the first differences to visually assess stationarity
plot(diff(data$GS1M))
The differenced series looks more stationary.
# Ljung-Box test on the first 15 lags
# (null hypothesis: the observations are independently distributed, i.e. no serial correlation)
Box.test(data$GS1M, lag = 15, type = "Ljung")
##
## Box-Ljung test
##
## data: data$GS1M
## X-squared = 2188.5, df = 15, p-value < 2.2e-16
The very small p-value rejects the null hypothesis of independence: the lags exhibit serial correlation.
# Fit Autoregressive (AR) Model on the differenced series
fin_mod <- Arima(GS1M_values, order=c(15, 1, 0))
# Order(p, d, q): p=15 (AR), d=1 (first differencing), q=0 (MA)
summary(fin_mod)
## Series: GS1M_values
## ARIMA(15,1,0)
##
## Coefficients:
## ar1 ar2 ar3 ar4 ar5 ar6 ar7 ar8 ar9
## 0.1860 0.0611 0.2019 0.1371 -0.1744 0.1911 0.1724 -0.0343 0.0972
## s.e. 0.0607 0.0631 0.0627 0.0643 0.0650 0.0651 0.0660 0.0679 0.0743
## ar10 ar11 ar12 ar13 ar14 ar15
## -0.1258 0.0322 0.126 -0.1692 0.0200 -0.0579
## s.e. 0.0755 0.0750 0.075 0.0737 0.0743 0.0715
##
## sigma^2 = 0.03943: log likelihood = 60.29
## AIC=-88.58 AICc=-86.43 BIC=-31
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 0.002635001 0.1926092 0.109349 -Inf Inf 0.9223443 -0.000969829
# Forecasting for the final ARIMA(15,1,0) model
fin_forecast <- forecast(fin_mod, h = 10) # Forecast the next 10 steps
# Summarize the forecast: model info, error measures, and prediction intervals
summary(fin_forecast)
##
## Forecast method: ARIMA(15,1,0)
##
## Model Information:
## Series: GS1M_values
## ARIMA(15,1,0)
##
## Coefficients:
## ar1 ar2 ar3 ar4 ar5 ar6 ar7 ar8 ar9
## 0.1860 0.0611 0.2019 0.1371 -0.1744 0.1911 0.1724 -0.0343 0.0972
## s.e. 0.0607 0.0631 0.0627 0.0643 0.0650 0.0651 0.0660 0.0679 0.0743
## ar10 ar11 ar12 ar13 ar14 ar15
## -0.1258 0.0322 0.126 -0.1692 0.0200 -0.0579
## s.e. 0.0755 0.0750 0.075 0.0737 0.0743 0.0715
##
## sigma^2 = 0.03943: log likelihood = 60.29
## AIC=-88.58 AICc=-86.43 BIC=-31
##
## Error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 0.002635001 0.1926092 0.109349 -Inf Inf 0.9223443 -0.000969829
##
## Forecasts:
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## 272 5.653839 5.399374 5.908304 5.264669 6.043009
## 273 5.448865 5.054109 5.843620 4.845138 6.052591
## 274 5.475821 4.963759 5.987884 4.692690 6.258953
## 275 5.681008 5.040392 6.321625 4.701270 6.660746
## 276 5.417362 4.639166 6.195559 4.227214 6.607511
## 277 5.469386 4.583646 6.355127 4.114763 6.824010
## 278 5.460479 4.453713 6.467246 3.920763 7.000196
## 279 5.413923 4.263843 6.564003 3.655027 7.172819
## 280 5.305388 4.021094 6.589682 3.341230 7.269546
## 281 5.375244 3.950643 6.799844 3.196505 7.553982
# plotly is attached above and masks graphics::layout() (see the package
# startup messages), so an unqualified layout() call would dispatch to
# plotly::layout instead of setting up the two-panel plotting grid.
# Qualify the call explicitly to get the base-graphics behavior.
graphics::layout(matrix(1:2, nrow = 1))
plot(fin_forecast)
plot(fin_forecast, main = "Zoomed In", xlim = c(250, 300))
# Grid search over ARIMA(p, 1, q) orders for GS1M, ranked by AIC
# Define the range of p and q values to evaluate
p_values <- 1:10
q_values <- 1:10
# Preallocate a list of one-row results instead of growing a data frame with
# rbind() inside the loop, which copies the whole frame on every iteration
fits <- vector("list", length(p_values) * length(q_values))
k <- 0
# Loop through each combination of p and q values
# (p outer, q inner — matching the row order of the printed output below)
for (p in p_values) {
  for (q in q_values) {
    k <- k + 1
    # Fit the candidate ARIMA model (first difference, d = 1) and record its AIC
    fit <- Arima(GS1M_values, order = c(p, 1, q))
    fits[[k]] <- data.frame(p = p, q = q, AIC = AIC(fit))
  }
}
# Combine all per-model rows into a single results data frame
results_df <- do.call(rbind, fits)
# Print the first few rows of the results
print(head(results_df))
## p q AIC
## 1 1 1 -78.45630
## 2 1 2 -76.78491
## 3 1 3 -80.09598
## 4 1 4 -74.56527
## 5 1 5 -73.61364
## 6 1 6 -93.18428
#3D plot of p and q parameters of ARIMA(p,1,q) AIC scores
# scatter3d() renders to the rgl device as a side effect; don't capture its
# return value with `=` into a variable named `plot`, which both uses `=` for
# top-level assignment and shadows the base plot() function.
scatter3d(x = results_df$p, y = results_df$AIC, z = results_df$q)
## Loading required namespace: mgcv
## Loading required namespace: MASS
The 3D scatter plot shows that p = 5, q = 4 yields the ARIMA(p,1,q) model with the lowest AIC and is therefore the best of these models under first differencing (d = 1). We can also see a relationship in which the deeper lags seem to negatively impact the AIC score. A similar interactive graph is shown below, though it is somewhat harder to interpret.
# Interactive 3D scatter of the same (p, q) vs. AIC surface using plotly
plot_ly(data = results_df, x = ~p, y = ~AIC, z = ~q, type = "scatter3d", mode = "markers")